/*    Copyright 2010 Tobias Marschall
 *
 *    This file is part of MoSDi.
 *
 *    MoSDi is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    MoSDi is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with MoSDi.  If not, see <http://www.gnu.org/licenses/>.
 */

package mosdi.subcommands;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import mosdi.fa.Alphabet;
import mosdi.fa.CDFA;
import mosdi.fa.DFAFactory;
import mosdi.fa.GeneralizedString;
import mosdi.fa.GeneralizedStringsHammingNFA;
import mosdi.fa.GeneralizedStringsNFA;
import mosdi.fa.NFA;
import mosdi.util.Iupac;
import mosdi.util.Log;
import mosdi.util.NamedSequence;
import mosdi.util.SequenceUtils;

public class GeneratePfmSubcommand extends Subcommand {

	@Override
	public String usage() {
		return
		super.usage()+" [options] <iupac-pattern> <fasta-file>\n" +
		"\n" +
		"Options:\n" +
		"  -r: also output matches of reverse complementary motif\n" +
		"  -f <output-filename>: create fasta file with matching sequences\n" +
		"                        (e.g. to be fed into http://weblogo.berkeley.edu)\n" +
		"                        Use \"-\" for standard out.\n" +
		"  -h: add header row and column\n" +
		"  -H <hamming-distance>: add Hamming neighborhood to patterns\n" +
		"  -N <nodelimit>: if exceeded, construction of automaton will aborted (default: 1000)";
	}

	@Override
	public String description() {
		return
		"Given a IUPAC pattern and sequences, creates a position frequency " +
		"matrix (PFM) for all matches of the pattern in the sequences.";
	}

	@Override
	public String name() {
		return "generate-pfm";
	}

	@Override
	public int run(String[] args) {
		parseOptions(args, 2, "rf:hH:N:");

		// Option dependencies
		// -- none --

		// Mandatory arguments
		String pattern = getStringArgument(0);
		String inputFilename = getStringArgument(1);

		// Options
		boolean considerReverse = getBooleanOption("r", false);
		String outputFilename = getStringOption("f", null);
		boolean addHeader = getBooleanOption("h", false);
		int hammingDistance = getNonNegativeIntOption("H", 0);
		int nodeLimit = getPositiveIntOption("N", 1000);

		BufferedWriter fastaFile = null;
		if (outputFilename!=null) {
			if (outputFilename.equals("-")) {
				fastaFile = new BufferedWriter(new OutputStreamWriter(System.out));
			} else {
				try {
					fastaFile = new BufferedWriter(new FileWriter(outputFilename));
				} catch (IOException e) {
					Log.errorln(String.format("Cannot open file \"%s\" for writing", outputFilename));
					System.exit(1);
				}
			}
		}

		Alphabet dnaAlphabet = Alphabet.getDnaAlphabet();
		List<NamedSequence> namedSequences = null;
		try {
			namedSequences = SequenceUtils.readFastaFile(inputFilename, dnaAlphabet);
		} catch (Exception e) {
			Log.errorln(e.getMessage());
			System.exit(1);
		}
		List<int[]> sequences = SequenceUtils.sequenceList(namedSequences);

		// construct automaton
		List<GeneralizedString> l = new ArrayList<GeneralizedString>(1);
		l.add(Iupac.toGeneralizedString(pattern));
		NFA nfa;
		if (hammingDistance==0) {
			nfa = new GeneralizedStringsNFA(l); 
		} else {
			nfa = new GeneralizedStringsHammingNFA(l, hammingDistance);
		}
		CDFA cdfa = DFAFactory.build(dnaAlphabet, nfa, nodeLimit);
		int[][] pfm = new int[pattern.length()][];
		for (int i=0; i<pattern.length(); ++i) pfm[i] = new int[4]; 
		for (int[] s : sequences) {
			List<CDFA.MatchPosition> matchPositions = cdfa.findMatchPositions(s);
			for (CDFA.MatchPosition mp : matchPositions) {
				int start = mp.getPosition()-pattern.length()+1;
				for (int i=0; i<pattern.length(); ++i) {
					pfm[i][s[start+i]]+=1;
				}
				if (fastaFile!=null) {
					try {
						fastaFile.write(">\n");
						fastaFile.write(dnaAlphabet.buildString(Arrays.copyOfRange(s, start, start+pattern.length())));
						fastaFile.write("\n");
					} catch (IOException e) {
						Log.errorln("Error while writing fasta file!");
						System.exit(1);
					}
				}
			}
			if (considerReverse) {
				int[] reverseS = dnaAlphabet.buildIndexArray(Iupac.reverseComplementary(dnaAlphabet.buildString(s)));
				List<CDFA.MatchPosition> reverseMatchPositions = cdfa.findMatchPositions(reverseS);
				for (CDFA.MatchPosition mp : reverseMatchPositions) {
					int start = mp.getPosition()-pattern.length()+1;
					for (int i=0; i<pattern.length(); ++i) {
						pfm[i][reverseS[start+i]]+=1;
					}
					if (fastaFile!=null) {
						try {
							fastaFile.write(">\n");
							fastaFile.write(dnaAlphabet.buildString(Arrays.copyOfRange(reverseS, start, start+pattern.length())));
							fastaFile.write("\n");
						} catch (IOException e) {
							Log.errorln("Error while writing fasta file!");
							System.exit(1);
						}
					}
				}
			}
		}
		StringBuilder sb = new StringBuilder();
		if (addHeader) {
			sb.append('\t');
			for (int i=0; i<pattern.length(); ++i) {
				sb.append(i);
				sb.append('\t');
			}
			sb.append('\n');
		}
		for (int c=0; c<4; ++c) {
			if (addHeader) {
				sb.append(dnaAlphabet.get(c));
				sb.append('\t');
			}
			for (int i=0; i<pattern.length(); ++i) {
				sb.append(pfm[i][c]);
				sb.append('\t');
			}
			sb.append('\n');
		}
		Log.print(Log.Level.STANDARD, sb.toString());
		if (fastaFile!=null) {
			try { fastaFile.flush(); }
			catch (IOException e) {
				Log.errorln("Error while writing fasta file!");
				System.exit(1);				
			}
		}
		return 0;
	}	
}
